29 June, 2017

suppressWarnings(library(tidyverse))
suppressWarnings(library(magrittr))
suppressWarnings(library(knitr))
suppressWarnings(library(arsRtools))
unscaled_tss_mat_file <- '/Users/schmidm/Documents/other_people_to_and_from/ClaudiaI/MS_PASHA/unscaled_deeptools_out/Refseqv3_single_isoform_pc_gene_allq_TSSpm5k_matrix.gz'

biotype <- 'pc_allquintiles'
anchor <- 'TSS'

matrix files for ChIPseq all quintiles: /Users/schmidm/Documents/other_people_to_and_from/ClaudiaI/MS_PASHA/unscaled_deeptools_out/Refseqv3_single_isoform_pc_gene_allq_TSSpm5k_matrix.gz
biotype: pc_allquintiles
anchor: TSS

samples table:

data("chip_samples_table", package = 'arsRtools')

manual scales

data("Refseq_pc_g2k_q1_second_half_body_scales", package='arsRtools')

simple plot raw values theme

data('lineplot_theme', package='arsRtools')

Deeptools to R

code not run only shown for documentation

This part is not run here, just shown for Documentation. The resulting tidy data frame has been saved and is loaded in the next section.

#!/bin/sh

cd ~/faststorage/ClaudiaI/Pasha_Results/unscaled_wigs

computeMatrix reference-point \
-R "/home/schmidm/annotations/hg19/RefSeqGRCh37.3/Refseq_single_isoform_pc_genes_allquintiles.bed" \
-S *_[P,I][o,n]*.bw \
--referencePoint=TSS --upstream=5000 --downstream=5000 \
--averageTypeBins=sum --missingDataAsZero --binSize=50 \
--outFileName deeptools_out/Refseqv3_single_isoform_pc_gene_allq_TSSpm5k_matrix.gz \
--numberOfProcessors 4

echo "done"

Load Deeptools heatmap

code not run only shown for documentation

bin_values <- arsRtools::load_chip_matrix(unscaled_tss_mat_file, chip_samples_table) 
bin_values_filename <- paste0('../data/ChIPseq/ChIPseq_bin_values_', biotype, '_', anchor, '.RData')

Saving bin values to file: ../data/ChIPseq/ChIPseq_bin_values_pc_allquintiles_TSS.RData

save(bin_values, file=bin_values_filename)

Load the tidy data frame of single 50bp bin values

everything below is run.

load(bin_values_filename, verbose=TRUE)
bin_values

load the quintile information

from Figure S0a Rmd file…

load('../data/ChIPseq/Refseq.3_protein_coding_genes_promoter_ChIP_signal.RData', verbose=TRUE)
pc_ChIP_rel_input

get only quintile information for fusing with bin_values

(pc_ChIP_rel_input %<>%
  unite(id, c(gene_id, rna_id, Refseq_id, gene_name), sep=':') %>%
  dplyr::select(id, quintile))
(bin_values %<>% left_join(., pc_ChIP_rel_input) %>%
   filter(!is.na(quintile)))

BGsub

BGsub and scaling is normally part of the arsRtools::scale_chip_matrix function. Here the individual code parts are shown:
Note though that quintiles are normally not relevant and dropped in the scale_chip_matrix function.

first split inputs and ips

(inputs <- dplyr::filter(bin_values, ab=='Input') %>%
    dplyr::select(id, rel_pos, sample_name, value) %>%
    dplyr::rename(used_input = sample_name, Input = value))
(ips <- dplyr::filter(bin_values, ab=='PolII') %>%
    dplyr::select(id, quintile, siRNA, ab, main_run, run, rel_pos, sample_name, used_input, value) %>%
    dplyr::rename(ChIP = value))

Here we use Refseq_pc_g2k_q1_second_half_body_scales from package arsRtools for scaling

(BGsub <- left_join(ips, inputs) %>%
    dplyr::select(id, quintile, siRNA, ab, main_run, run, rel_pos, ChIP, Input) %>%
    left_join(., Refseq_pc_g2k_q1_second_half_body_scales) %>%
    mutate(scaled_input = Input*input_scale_factor,
           BGSub_ChIP = ChIP-scaled_input))

This will result in all ChIP samples having quintile 5 at an average value of ~ 0.

scaling

Scaling tries to adjust the ChIP samples from different runs between each other. This is done using the gene body information from quintile 1. One can see the effect quite clearly on TSS-proximal metagene profiles for quintile 1..

(scaled_BGsub <- BGsub %>%
    mutate(scaled_BGSub_ChIP = BGSub_ChIP * chip_scale_factor,
           siRNA = factor(siRNA, levels=c('siARS2', 'siFFL', 'siRRP40', 'siZ18')),
           run = factor(run, levels=c('CI62', 'CI63',
                                      'MA72_z18', 'MA73_z18',
                                      'MA72_rrp40', 'MA73_rrp40')),
           main_run = factor(main_run, levels=c('CI siARS2',
                                                'MA siZ18',
                                                'MA siRRP40'))) %>%
    dplyr::select(id, quintile, main_run, run, siRNA, rel_pos, scaled_BGSub_ChIP))

This will result in all ChIP samples having gene body values, ie TSS-distal region of quintile 1 at average at a similar value of ~ 1

save the BGsub values

save(scaled_BGsub, file='../data/Refseq.3_protein_coding_genes_ChIP_scaled_rel2ndhalf_BGsub_values_relTSS.RData')

everything below is run

Metagene plots and heatmap from scaled_BGsub data

load('../data/Refseq.3_protein_coding_genes_ChIP_scaled_rel2ndhalf_BGsub_values_relTSS.RData', verbose=TRUE)
## Loading objects:
##   scaled_BGsub
scaled_BGsub
## # A tibble: 33,297,600 × 7
##                                          id quintile  main_run    run
##                                       <chr>   <fctr>    <fctr> <fctr>
## 1    gene1:rna0:XM_003403543.1:LOC100652771        4 CI siARS2   CI62
## 2           gene7:rna4:NM_001005484.1:OR4F5        4 CI siARS2   CI62
## 3   gene13:rna7:XM_002344251.2:LOC100288646        4 CI siARS2   CI62
## 4         gene14:rna8:NM_001005221.2:OR4F29        5 CI siARS2   CI62
## 5        gene17:rna10:NM_001005277.1:OR4F16        4 CI siARS2   CI62
## 6  gene18:rna11:XM_002342011.2:LOC100287654        4 CI siARS2   CI62
## 7           gene31:rna22:NM_152486.2:SAMD11        3 CI siARS2   CI62
## 8            gene32:rna23:NM_015658.3:NOC2L        3 CI siARS2   CI62
## 9           gene33:rna24:NM_198317.2:KLHL17        3 CI siARS2   CI62
## 10           gene38:rna30:NM_005101.3:ISG15        3 CI siARS2   CI62
## # ... with 33,297,590 more rows, and 3 more variables: siRNA <fctr>,
## #   rel_pos <dbl>, scaled_BGSub_ChIP <dbl>

use only genes > 7kb

get gene length info

load('../data/annotations/Refseq_single_isoform_pc_genes.RData', verbose=TRUE)
## Loading objects:
##   mRNA_genes
mRNA_genes
## # A tibble: 13,944 × 8
##       chr  start    end gene_id rna_id      Refseq_id  score strand
## *  <fctr>  <int>  <int>   <chr>  <chr>          <chr> <fctr> <fctr>
## 1    chr1  12190  13639   gene1   rna0 XM_003403543.1      .      +
## 2    chr1  69091  70008   gene7   rna4 NM_001005484.1      .      +
## 3    chr1 329790 342507  gene13   rna7 XM_002344251.2      .      +
## 4    chr1 367659 368597  gene14   rna8 NM_001005221.2      .      +
## 5    chr1 621096 622034  gene17  rna10 NM_001005277.1      .      -
## 6    chr1 647187 659924  gene18  rna11 XM_002342011.2      .      -
## 7    chr1 861121 879961  gene31  rna22    NM_152486.2      .      +
## 8    chr1 879583 894679  gene32  rna23    NM_015658.3      .      -
## 9    chr1 895967 901099  gene33  rna24    NM_198317.2      .      +
## 10   chr1 948847 949920  gene38  rna30    NM_005101.3      .      +
## # ... with 13,934 more rows
(gene_len_df <- mRNA_genes %>%
  mutate(gene_length = end -start) %>%
  dplyr::select(gene_id, gene_length))
## # A tibble: 13,944 × 2
##    gene_id gene_length
##      <chr>       <int>
## 1    gene1        1449
## 2    gene7         917
## 3   gene13       12717
## 4   gene14         938
## 5   gene17         938
## 6   gene18       12737
## 7   gene31       18840
## 8   gene32       15096
## 9   gene33        5132
## 10  gene38        1073
## # ... with 13,934 more rows

remove shorter genes

(scaled_BGsub %<>%
  mutate(gene_id = sub(':.*', '', id)) %>%
  left_join(., gene_len_df) %>%
  filter(gene_length > 7000))
## # A tibble: 22,644,000 × 9
##                                          id quintile  main_run    run
##                                       <chr>   <fctr>    <fctr> <fctr>
## 1   gene13:rna7:XM_002344251.2:LOC100288646        4 CI siARS2   CI62
## 2  gene18:rna11:XM_002342011.2:LOC100287654        4 CI siARS2   CI62
## 3           gene31:rna22:NM_152486.2:SAMD11        3 CI siARS2   CI62
## 4            gene32:rna23:NM_015658.3:NOC2L        3 CI siARS2   CI62
## 5             gene39:rna31:NM_198576.2:AGRN        5 CI siARS2   CI62
## 6         gene41:rna33:NM_017891.4:C1orf159        3 CI siARS2   CI62
## 7        gene54:rna53:NM_001130413.3:SCNN1D        4 CI siARS2   CI62
## 8            gene55:rna55:NM_030649.2:ACAP3        1 CI siARS2   CI62
## 9           gene57:rna57:NM_017871.4:CPSF3L        1 CI siARS2   CI62
## 10            gene60:rna60:NM_004421.2:DVL1        1 CI siARS2   CI62
## # ... with 22,643,990 more rows, and 5 more variables: siRNA <fctr>,
## #   rel_pos <dbl>, scaled_BGSub_ChIP <dbl>, gene_id <chr>,
## #   gene_length <int>

metagene line plot

scaled_BGsub_metagene <- scaled_BGsub %>%
  group_by(siRNA, run, main_run, rel_pos, quintile) %>%
  summarize(mean_scaled_BGSub_value = mean(scaled_BGSub_ChIP)) 
scaled_BGsub_metagene %>%
  filter(!is.na(quintile)) %>%
ggplot(.) +
  geom_line(aes(x=rel_pos, y=mean_scaled_BGSub_value, color=siRNA), alpha=.7) +
  facet_grid(quintile ~ run) +
  lineplot_theme +
  xlim(-2000,5000) +
  ylab('scaled BGsub ChIP value')

showing both replicates together with area in between colored

scaled_BGsub_metagene %>%
  group_by(quintile, siRNA, main_run, rel_pos) %>%
  mutate(upper = max(mean_scaled_BGSub_value), lower = min(mean_scaled_BGSub_value), center = mean(mean_scaled_BGSub_value)) %>%
ggplot(.) +
  geom_line(aes(x=rel_pos, y=mean_scaled_BGSub_value, color=siRNA), alpha=.7) +
  geom_ribbon(aes(x=rel_pos, ymin=lower, ymax=upper, fill=siRNA), alpha=.7 ) +
  facet_grid(quintile~main_run) +
  lineplot_theme +
  xlim(-2000,5000) +
  ylab('scaled BGsub ChIP value')

heatmap all quintiles

min_val <- min(scaled_BGsub$scaled_BGSub_ChIP[scaled_BGsub$scaled_BGSub_ChIP>0])
  
scaled_BGsub_ratio <- scaled_BGsub %>%
  mutate(KD = ifelse(siRNA == 'siFFL', 'siFFL', 'KD'),
         scaled_BGSub_ChIP = ifelse(scaled_BGSub_ChIP <= 0, min_val, scaled_BGSub_ChIP+min_val)) %>%
  dplyr::select(id, quintile, rel_pos, KD, main_run, scaled_BGSub_ChIP) %>%
  group_by(KD, id, quintile, main_run, KD, rel_pos) %>%
  summarize(scaled_BGSub_ChIP = mean(scaled_BGSub_ChIP)) %>%
  spread(KD, scaled_BGSub_ChIP) %>%
  mutate(log2_ratio = log2(KD)-log2(siFFL) )
data('RNAseq_logFC_heatmap_theme', package='arsRtools')
p <- scaled_BGsub_ratio %>%
    filter(!is.na(log2_ratio),
         rel_pos > -2000, rel_pos < 5000) %>%
    ungroup() %>%
  mutate(log2_ratio = case_when(.$log2_ratio > 2 ~ 2,
                                .$log2_ratio < -2 ~ -2,
                                TRUE ~ .$log2_ratio)) %>%
  ggplot(., aes(x=rel_pos, y=id, fill=log2_ratio)) +
  geom_raster(interpolate = FALSE) +
  facet_grid(quintile~main_run, scales='free') +
  RNAseq_logFC_heatmap_theme +
  theme(axis.text.y = element_blank())
p +
  ylab('log2FC based on values scaled to pc gene body')

all_zero_ids <- scaled_BGsub_ratio %>%
  group_by(id) %>%
  summarize(min_value = min(log2_ratio),
            max_value = max(log2_ratio)) %>%
  filter(min_value == 0, max_value == 0) %$%
  id
p <- scaled_BGsub_ratio %>%
    filter(!is.na(log2_ratio),
           !(id %in% all_zero_ids),
         rel_pos > -2000, rel_pos < 5000) %>%
    ungroup() %>%
  mutate(log2_ratio = case_when(.$log2_ratio > 2 ~ 2,
                                .$log2_ratio < -2 ~ -2,
                                TRUE ~ .$log2_ratio)) %>%
  ggplot(., aes(x=rel_pos, y=id, fill=log2_ratio)) +
  geom_raster(interpolate = FALSE) +
  facet_grid(quintile~main_run, scales='free') +
  RNAseq_logFC_heatmap_theme +
  theme(axis.text.y = element_blank())
p +
  ylab('log2FC based on values scaled to pc gene body')

sessionInfo()
## R version 3.2.3 (2015-12-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.11.6 (El Capitan)
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] arsRtools_0.1        RColorBrewer_1.1-2   rtracklayer_1.30.4  
##  [4] GenomicRanges_1.22.4 GenomeInfoDb_1.6.3   IRanges_2.4.8       
##  [7] S4Vectors_0.8.11     BiocGenerics_0.16.1  knitr_1.15.1        
## [10] magrittr_1.5         dplyr_0.5.0          purrr_0.2.2         
## [13] readr_1.0.0          tidyr_0.6.0          tibble_1.2          
## [16] ggplot2_2.2.0        tidyverse_1.0.0     
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.8                futile.logger_1.4.3       
##  [3] plyr_1.8.4                 XVector_0.10.0            
##  [5] futile.options_1.0.0       bitops_1.0-6              
##  [7] tools_3.2.3                zlibbioc_1.16.0           
##  [9] digest_0.6.10              evaluate_0.10             
## [11] gtable_0.2.0               DBI_0.5-1                 
## [13] yaml_2.1.14                stringr_1.1.0             
## [15] Biostrings_2.38.4          rprojroot_1.1             
## [17] grid_3.2.3                 Biobase_2.30.0            
## [19] R6_2.2.0                   BiocParallel_1.4.3        
## [21] XML_3.98-1.5               rmarkdown_1.2             
## [23] reshape2_1.4.2             lambda.r_1.1.9            
## [25] GenomicAlignments_1.6.3    Rsamtools_1.22.0          
## [27] backports_1.0.4            scales_0.4.1              
## [29] htmltools_0.3.5            SummarizedExperiment_1.0.2
## [31] assertthat_0.1             colorspace_1.3-2          
## [33] labeling_0.3               stringi_1.1.2             
## [35] RCurl_1.95-4.8             lazyeval_0.2.0            
## [37] munsell_0.4.3